{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "43defe65",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "import lxml.etree as le\n",
    "import pandas as pd\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "c0406c92",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://www.runoob.com/html/html-tutorial.html'\n",
    "x = '//div[@class=\"design\"]//a'\n",
    "domain = 'https://www.runoob.com'\n",
    "\n",
    "# Fetch the tutorial page. The timeout keeps a stalled connection from hanging\n",
    "# the kernel, and raise_for_status() stops the run on an HTTP error instead of\n",
    "# silently parsing an error page into an empty result.\n",
    "response = requests.get(url=url, timeout=10)\n",
    "response.raise_for_status()\n",
    "content = response.content\n",
    "\n",
    "# Parse the raw bytes as HTML and select the <a> elements matched by the XPath in x.\n",
    "contenttx = le.HTML(content)\n",
    "rets = contenttx.xpath(x)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "5de5f0f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<Element a at 0x1b79f318800>, <Element a at 0x1b79f318740>, <Element a at 0x1b79f581380>, <Element a at 0x1b79f5818c0>, <Element a at 0x1b79f581900>, <Element a at 0x1b79f581f80>, <Element a at 0x1b79f581440>, <Element a at 0x1b79f581200>, <Element a at 0x1b79f581c00>, <Element a at 0x1b79f581a40>, <Element a at 0x1b79f581180>, <Element a at 0x1b79f5d13c0>, <Element a at 0x1b79f5d1840>, <Element a at 0x1b79c2fbc40>, <Element a at 0x1b79c2fb440>, <Element a at 0x1b79c2fb700>, <Element a at 0x1b79c2fb980>, <Element a at 0x1b79c2fb580>, <Element a at 0x1b79c2fb180>, <Element a at 0x1b79f637780>, <Element a at 0x1b79f637300>, <Element a at 0x1b79f637500>, <Element a at 0x1b79f6373c0>, <Element a at 0x1b79f5f8200>, <Element a at 0x1b79f5ac840>, <Element a at 0x1b79f5acac0>, <Element a at 0x1b79f5ac580>, <Element a at 0x1b79f5acec0>, <Element a at 0x1b79f5ac3c0>, <Element a at 0x1b79f5ac500>, <Element a at 0x1b79f5acbc0>, <Element a at 0x1b79c2cd200>, <Element a at 0x1b79c2cd840>, <Element a at 0x1b79c2cd8c0>, <Element a at 0x1b79c2cd440>, <Element a at 0x1b79c2cd900>, <Element a at 0x1b79c2cdec0>, <Element a at 0x1b79f5a0840>, <Element a at 0x1b79f5a0f40>, <Element a at 0x1b79db92e80>, <Element a at 0x1b79f5d3dc0>, <Element a at 0x1b79f5d3040>, <Element a at 0x1b79dbb9880>, <Element a at 0x1b79f5c1180>, <Element a at 0x1b79f5c1280>, <Element a at 0x1b79f63aec0>, <Element a at 0x1b79dbb4c80>, <Element a at 0x1b79f2dd180>, <Element a at 0x1b79f2f4440>, <Element a at 0x1b79f61ce80>, <Element a at 0x1b79f56bc80>, <Element a at 0x1b79f56bb00>, <Element a at 0x1b79f56b700>, <Element a at 0x1b79f56b300>, <Element a at 0x1b79f5e0d40>, <Element a at 0x1b79f5e0b80>, <Element a at 0x1b79f5e0c00>, <Element a at 0x1b79f5e0980>, <Element a at 0x1b79f5e5480>, <Element a at 0x1b79f5e5ec0>, <Element a at 0x1b79f5f3240>, <Element a at 0x1b79f5f3600>, <Element a at 0x1b79f584840>, <Element a at 0x1b79f5fa640>, <Element a at 0x1b79f624980>, <Element a at 0x1b79f5a4440>, <Element a 
at 0x1b79f5a4e40>, <Element a at 0x1b79f5a4a80>, <Element a at 0x1b79f5a4f40>, <Element a at 0x1b79f5a43c0>, <Element a at 0x1b79f5a4ec0>, <Element a at 0x1b79f5a4bc0>, <Element a at 0x1b79f5a4300>, <Element a at 0x1b79f570cc0>]\n"
     ]
    }
   ],
   "source": [
    "# Debug check: show the raw elements returned by the XPath query.\n",
    "print(rets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "139a67f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build one {title, href} record per matched anchor, then load them into a DataFrame.\n",
    "datas = []\n",
    "\n",
    "for ret in rets:\n",
    "    # Anchors without an href cannot be turned into a link, so skip them\n",
    "    # instead of failing with KeyError.\n",
    "    href = ret.get('href')\n",
    "    if href is None:\n",
    "        continue\n",
    "    # itertext() also collects text nested in child tags; ret.text alone is None\n",
    "    # when the anchor starts with a child element, which would crash re.sub.\n",
    "    raw_text = ''.join(ret.itertext())\n",
    "    # Remove tabs and line breaks only. The earlier class '[\\t,\\n,\\r]' also\n",
    "    # matched literal commas and deleted them from titles.\n",
    "    text = re.sub(r'[\\t\\n\\r]+', '', raw_text)\n",
    "    # Resolve against the page URL: root-relative hrefs give the same result as\n",
    "    # domain + href, while absolute or relative hrefs are no longer corrupted.\n",
    "    datas.append(dict(title=text, href=urljoin(url, href)))\n",
    "\n",
    "# Name the columns explicitly so an empty scrape still yields the expected schema.\n",
    "df_datas = pd.DataFrame(datas, columns=['title', 'href'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d7a0916e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          title                                               href\n",
      "0       HTML 教程     https://www.runoob.com/html/html-tutorial.html\n",
      "1       HTML 简介        https://www.runoob.com/html/html-intro.html\n",
      "2      HTML 编辑器      https://www.runoob.com/html/html-editors.html\n",
      "3       HTML 基础        https://www.runoob.com/html/html-basic.html\n",
      "4       HTML 元素     https://www.runoob.com/html/html-elements.html\n",
      "..          ...                                                ...\n",
      "69  HTML URL 编码    https://www.runoob.com/tags/html-urlencode.html\n",
      "70    HTML 语言代码  https://www.runoob.com/tags/html-language-code...\n",
      "71      HTTP 消息  https://www.runoob.com/tags/html-httpmessages....\n",
      "72      HTTP 方法  https://www.runoob.com/tags/html-httpmethods.html\n",
      "73        键盘快捷键  https://www.runoob.com/tags/html-keyboardshort...\n",
      "\n",
      "[74 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "# Preview the scraped title/href table.\n",
    "print(df_datas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "f2f8f26e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the table to an Excel workbook. The context manager saves and closes\n",
    "# the file on exit, even if to_excel raises. ExcelWriter.save() was deprecated\n",
    "# in pandas 1.5 and removed in 2.0, so it must not be called directly.\n",
    "with pd.ExcelWriter('5_6homework.xlsx') as writer:\n",
    "    df_datas.to_excel(writer, sheet_name='5_6homework', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
